library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(plotlyutils)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(GGally) ; library(ggpubr)
library(Rtsne)
library(rstatix)
library(expss)
library(ClusterR)
library(DESeq2) ; library(biomaRt)
library(knitr)
# Load Gandal dataset
load('./../Data/preprocessedData/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
Diagnosis and brain region seem to be balanced except for the frontal lobe, where there are more control samples than ASD ones
table_info = datMeta %>% apply_labels(Diagnosis = 'Diagnosis', Brain_lobe = 'Brain Lobe',
Batch = 'Batch', Sex = 'Gender')
cro(table_info$Diagnosis, list(table_info$Brain_lobe,total()))
|  Brain Lobe |  #Total | |||||
|---|---|---|---|---|---|---|
|  Frontal |  Occipital |  Parietal |  Temporal |  | ||
|  Diagnosis | ||||||
| Â Â Â CTLÂ | 13 | 8 | 8 | 6 | Â | 35 |
| Â Â Â ASDÂ | 8 | 13 | 12 | 12 | Â | 45 |
|    #Total cases | 21 | 21 | 20 | 18 |  | 80 |
There are many more Male than Female samples, but Diagnosis and Gender seem to be balanced
cro(table_info$Diagnosis, list(table_info$Sex, total()))
|  Gender |  #Total | |||
|---|---|---|---|---|
| Â FÂ | Â MÂ | Â | ||
|  Diagnosis | ||||
| Â Â Â CTLÂ | 6 | 29 | Â | 35 |
| Â Â Â ASDÂ | 9 | 36 | Â | 45 |
|    #Total cases | 15 | 65 |  | 80 |
The difference in age between diagnosis groups is not statistically significant. The box plots look different between them but its probably because they have very few points each (35 and 45), so they are a bit noisy, looking at the individual points it seems like the main difference could be a few very young samples in the ASD group.
datMeta %>% ggplot(aes(Diagnosis, Age, fill = Diagnosis)) + geom_boxplot() +
geom_jitter(color='gray', size=2, alpha = 0.5) +
stat_compare_means(label = 'p.signif', method = 't.test', method.args = list(var.equal = FALSE)) +
theme_minimal() + theme(legend.position = 'none')
The first principal component separates the samples by diagnosis groups perfectly, there is no recognisable pattern in the samples coloured by brain lobe
pca = datExpr %>% t %>% prcomp
plot_data = pca$x %>% data.frame %>% dplyr::select(PC1:PC2) %>%
mutate('ID'= datExpr %>% colnames %>% substring(2)) %>%
left_join(datMeta, by=c('ID'='Dissected_Sample_ID')) %>%
dplyr::select('ID','PC1','PC2','Diagnosis','Brain_lobe') %>%
mutate('Diagnosis' = factor(Diagnosis, levels=c('CTL','ASD')))
p1 = plot_data %>% ggplot(aes(PC1, PC2, color=Diagnosis, shape=Diagnosis)) + geom_point(alpha = 0.8) +
xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)')) +
ggtitle('Samples coloured by Diagnosis') + coord_fixed() + theme_minimal()
p2 = plot_data %>% ggplot(aes(PC1, PC2, color=Brain_lobe, shape=Brain_lobe)) + geom_point(alpha = 0.8) +
xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)')) +
ggtitle('Samples coloured by Brain Lobe') + coord_fixed() + labs(color = 'Brain Lobe', shape = 'Brain Lobe') +
theme_minimal()
grid.arrange(p1, p2, nrow = 1)
rm(pca, p1, p2)
No matter the value of the perplexity parameter, Diagnosis is always an important factor in the way samples are positioned
perplexities = c(2,5,10,20)
ps = list()
for(i in 1:length(perplexities)){
set.seed(123)
tsne = datExpr %>% t %>% Rtsne(perplexity=perplexities[i])
plot_data = data.frame('ID'=colnames(datExpr), 'C1'=tsne$Y[,1], 'C2'=tsne$Y[,2]) %>%
mutate('ID'=substring(ID,2)) %>% left_join(datMeta, by=c('ID'='Dissected_Sample_ID')) %>%
dplyr::select('C1','C2','Diagnosis','Subject_ID') %>%
mutate('Diagnosis'=factor(Diagnosis, levels=c('CTL','ASD')))
ps[[i]] = plot_data %>% ggplot(aes(C1, C2, color=Diagnosis, shape=Diagnosis)) + geom_point(alpha = 0.7) + theme_minimal() +
ggtitle(paste0('Perplexity = ',perplexities[i])) + theme(legend.position='none') + coord_fixed()
}
grid.arrange(grobs=ps, nrow=2)
No matter the value of the perplexity parameter, brain lobe doesn’t seem to play any role in the way the points are positioned
perplexities = c(2,5,10,20)
ps = list()
for(i in 1:length(perplexities)){
set.seed(123)
tsne = datExpr %>% t %>% Rtsne(perplexity=perplexities[i])
plot_data = data.frame('ID'=colnames(datExpr), 'C1'=tsne$Y[,1], 'C2'=tsne$Y[,2]) %>%
mutate('ID'=substring(ID,2)) %>% left_join(datMeta, by=c('ID'='Dissected_Sample_ID')) %>%
dplyr::select('C1','C2','Brain_lobe','Subject_ID')
ps[[i]] = plot_data %>% ggplot(aes(C1, C2, color=Brain_lobe, shape=Brain_lobe)) + geom_point(alpha = 0.7) + theme_minimal() +
ggtitle(paste0('Perplexity = ',perplexities[i])) + theme(legend.position='none') + coord_fixed()
}
grid.arrange(grobs=ps, nrow=2)
As we saw above, the big clusters of points generally correspond to diagnosis groups, but we can see there are smaller clusters of two or three points. When we colour the samples by diagnosis we can see these small clusters correspond to the different samples extracted from the same subject.
perplexities = c(2,5,10,20)
ps = list()
for(i in 1:length(perplexities)){
set.seed(123)
tsne = datExpr %>% t %>% Rtsne(perplexity=perplexities[i])
plot_data = data.frame('ID'=colnames(datExpr), 'C1'=tsne$Y[,1], 'C2'=tsne$Y[,2]) %>%
mutate('ID'=substring(ID,2)) %>% left_join(datMeta, by=c('ID'='Dissected_Sample_ID')) %>%
dplyr::select('C1','C2','Brain_lobe','Subject_ID')
ps[[i]] = plot_data %>% ggplot(aes(C1, C2, color=Subject_ID)) + geom_point(alpha = 0.6) + theme_minimal() +
ggtitle(paste0('Perplexity = ',perplexities[i])) + theme(legend.position='none') + coord_fixed()
}
grid.arrange(grobs=ps, nrow=2)
Interactive version of the last plot above so we can corroborate the subject ID’s from each sample (because having so many different subject IDs, subjects assigned similar colours may be confused)
ggplotly(plot_data %>% ggplot(aes(C1, C2, color=Subject_ID)) + geom_point(aes(id=Subject_ID)) + theme_minimal() +
theme(legend.position='none') + ggtitle('t-SNE Perplexity = 20 coloured by Subject ID'))
The first principal component explains over 99% of the variance in the dataset and is strongly related to the mean level of expression of the genes
pca = datExpr %>% prcomp
plot_data = data.frame( 'PC1' = pca$x[,1], 'PC2' = pca$x[,2], 'MeanExpr'=rowMeans(datExpr))
plot_data %>% ggplot(aes(PC1, PC2, color=MeanExpr)) + geom_point(alpha=0.3) + theme_minimal() +
scale_color_viridis() + ggtitle('PCA') + coord_fixed() +
xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)')) +
theme(legend.position = 'bottom', legend.key.width=unit(3, 'cm')) + labs(color='Mean Expression ')
The correlation between the mean expression of the genes and the 1st principal component is 1
Independently of the perplexity parameter, the mean level of expression of the genes seems to continue to be the main feature that characterises the genes
perplexities = c(1,2,5,10,50,100)
ps = list()
for(i in 1:length(perplexities)){
file_name = paste0('./../Data/Results/tsne_perplexity_',perplexities[i],'.csv')
# Check if file with t-sne coordinates already exists, if not, calculate coordinates
if(file.exists(file_name)){
tsne = read.csv(file_name)
} else {
set.seed(123)
tsne = datExpr %>% Rtsne(perplexity=perplexities[i])
tsne_coords = cbind(tsne$Y, rownames(datExpr))
colnames(tsne_coords) = c('C1','C2','ID')
write.csv(tsne_coords, file_name, row.names=F)
tsne = tsne_coords
}
# Plot results
plot_data = data.frame('C1'=tsne[,1], 'C2'=tsne[,2], 'MeanExpr'=rowMeans(datExpr))
ps[[i]] = plot_data %>% ggplot(aes(C1, C2, color=MeanExpr)) + geom_point(alpha=0.5) + theme_minimal() + coord_fixed() +
scale_color_viridis() + ggtitle(paste0('Perplexity = ', perplexities[i])) + theme(legend.position='none')
}
grid.arrange(grobs=ps, nrow=2)
Samples belonging to the ASD group have statistically significantly higher levels of expression than the Control group (this was unexpected)
plot_data = data.frame('ID'=colnames(datExpr), 'Mean'=colMeans(datExpr)) %>%
mutate('ID'=substring(ID,2)) %>% left_join(datMeta, by=c('ID'='Dissected_Sample_ID'))
plot_data %>% ggplot(aes(Diagnosis, Mean, fill = Diagnosis)) +
geom_boxplot(outlier.colour='gray', outlier.shape='o', outlier.size=3) +
stat_compare_means(label = 'p.signif', method = 't.test', method.args = list(var.equal = FALSE)) +
ylab('Mean Expression') + theme_minimal() + theme(legend.position = 'none')
Samples extracted from the frontal lobe have a slightly lower mean level of expression than the samples from the other lobes, but this difference is not statistically significant
wt = plot_data %>% wilcox_test(Mean~Brain_lobe, p.adjust.method='BH') %>% add_x_position(x = 'group')
increase = .006
base = 8.175
pos_y_comparisons = c(base, base+increase, base+3*increase, base, base+2*increase, base)
p1 = plot_data %>% ggplot(aes(Brain_lobe, Mean)) +
geom_boxplot(outlier.colour='gray', outlier.shape='o', outlier.size=3, aes(fill = Brain_lobe)) +
stat_pvalue_manual(wt, y.position = pos_y_comparisons, tip.length = .02) +
xlab('Brain Lobe') + ylab('Mean Expression') + theme(legend.position = 'none') +
theme_minimal() + theme(legend.position = 'none')
p2 = plot_data %>% mutate(frontal_lobe = ifelse(Brain_lobe == 'Frontal', 'Frontal Lobe', 'Other lobes')) %>%
ggplot(aes(frontal_lobe, Mean, fill = frontal_lobe)) +
geom_boxplot(outlier.colour='gray', outlier.shape='o', outlier.size=3) +
stat_compare_means(label = 'p.signif', method = 't.test', method.args = list(var.equal = FALSE)) +
xlab('Brain Lobe') + ylab('Mean Expression') + theme(legend.position = 'none') +
theme_minimal() + theme(legend.position = 'none')
grid.arrange(p1, p2, nrow=1, widths = c(0.6,0.4))
There are 4473 DE genes using a threshold of LFC=0 (~28% of the total number of genes). As the threshold increases, the number of DE genes quickly decreases.
lfc_list = seq(0, 0.3, 0.01)
n_genes = nrow(datExpr)
# PCA
datExpr_pca_samps = datExpr %>% data.frame %>% t %>% prcomp
# Initialise data frame to save PCA outputs
pcas_samps = datExpr_pca_samps$x %>% data.frame %>% dplyr::select(PC1:PC2) %>%
mutate('ID'=colnames(datExpr), 'lfc'=-1, PC1=scale(PC1), PC2=scale(PC2))
pca_samps_old = pcas_samps
for(lfc in lfc_list){
# Recalculate DE_info with the new threshold (p-values change) an filter DE genes
DE_genes = results(dds, lfcThreshold=lfc, altHypothesis='greaterAbs') %>% data.frame %>%
mutate('ID'=rownames(.)) %>% filter(padj<0.05)
datExpr_DE = datExpr %>% data.frame %>% filter(rownames(.) %in% DE_genes$ID)
n_genes = c(n_genes, nrow(DE_genes))
# Calculate PCAs
datExpr_pca_samps = datExpr_DE %>% t %>% prcomp
# Create new DF entries
pca_samps_new = datExpr_pca_samps$x %>% data.frame %>% dplyr::select(PC1:PC2) %>%
mutate('ID'=colnames(datExpr), 'lfc'=lfc, PC1=scale(PC1), PC2=scale(PC2))
# Change PC sign if necessary
if(cor(pca_samps_new$PC1, pca_samps_old$PC1)<0) pca_samps_new$PC1 = -pca_samps_new$PC1
if(cor(pca_samps_new$PC2, pca_samps_old$PC2)<0) pca_samps_new$PC2 = -pca_samps_new$PC2
pca_samps_old = pca_samps_new
# Update DFs
pcas_samps = rbind(pcas_samps, pca_samps_new)
}
# Add Diagnosis information
pcas_samps = pcas_samps %>% mutate('ID'=substring(ID,2)) %>%
left_join(datMeta, by=c('ID'='Dissected_Sample_ID')) %>%
dplyr::select(ID, PC1, PC2, lfc, Diagnosis)
# Plot change of number of genes
ggplotly(data.frame('lfc'=lfc_list, 'n_genes'=n_genes[-1]) %>% ggplot(aes(x=lfc, y=n_genes)) +
geom_point() + geom_line() + theme_minimal() + xlab('Log Fold Change Magnitude') + ylab('DE Genes') +
ggtitle('Number of Differentially Expressed genes when modifying filtering threshold'))
rm(lfc_list, n_genes, lfc, pca_samps_new, pca_samps_old, datExpr_pca_samps)
library(ggrepel)
ggplotly(genes_info %>% mutate(gene_name = datGenes$hgnc_symbol) %>%
ggplot(aes(shrunken_log2FoldChange, padj, color=significant)) +
geom_point(alpha=0.2, aes(id = gene_name)) + scale_y_sqrt() +
xlab('LFC') + ylab('Adjusted p-value') + theme_minimal() + labs(color = 'DE')) #+
#geom_label_repel(aes(label=ifelse(shrunken_log2FoldChange>0.5 | shrunken_log2FoldChange< -0.39,
# as.character(gene_name),'')), direction = 'y', nudge_y = 0.01)
Most of the genes on this list are overexpressed in ASD and do not have a Neuronal tag
top_genes = genes_info %>% mutate(Gene = datGenes$hgnc_symbol) %>% filter(significant == TRUE) %>%
arrange(-abs(shrunken_log2FoldChange)) %>% top_n(25, wt=abs(shrunken_log2FoldChange)) %>%
mutate(Neuronal = as.logical(Neuronal), LFC = shrunken_log2FoldChange, n = 1:25)
kable(top_genes %>% dplyr::select(n, Gene, LFC, padj, Neuronal), caption = 'Top 25 DE genes ordered by LFC magnitude')
| n | Gene | LFC | padj | Neuronal |
|---|---|---|---|---|
| 1 | HSPA6 | 0.6915903 | 0.00e+00 | FALSE |
| 2 | THBS1 | 0.5417591 | 0.00e+00 | FALSE |
| 3 | IGF2BP2 | 0.5348835 | 5.00e-07 | FALSE |
| 4 | SHD | -0.5289553 | 0.00e+00 | FALSE |
| 5 | TNFRSF10D | 0.5237673 | 2.00e-07 | FALSE |
| 6 | BNC2 | 0.5015931 | 0.00e+00 | FALSE |
| 7 | VAMP1 | -0.4946280 | 0.00e+00 | TRUE |
| 8 | WEE1 | 0.4915898 | 0.00e+00 | TRUE |
| 9 | SERPINH1 | 0.4862111 | 6.30e-06 | FALSE |
| 10 | NFKB2 | 0.4842801 | 0.00e+00 | FALSE |
| 11 | CP | 0.4796249 | 5.50e-06 | FALSE |
| 12 | ITGA5 | 0.4749568 | 0.00e+00 | FALSE |
| 13 | STC1 | 0.4722304 | 1.72e-05 | FALSE |
| 14 | GPRC5A | 0.4610657 | 0.00e+00 | FALSE |
| 15 | SPTLC3 | 0.4596896 | 4.00e-07 | FALSE |
| 16 | GBP2 | 0.4574062 | 2.40e-06 | FALSE |
| 17 | SLC11A1 | 0.4572782 | 5.10e-06 | FALSE |
| 18 | VIM | 0.4550495 | 0.00e+00 | TRUE |
| 19 | PODN | 0.4546192 | 0.00e+00 | FALSE |
| 20 | ANXA2 | 0.4545567 | 5.40e-06 | FALSE |
| 21 | OLFML2B | 0.4511181 | 0.00e+00 | FALSE |
| 22 | HSPB1 | 0.4494746 | 0.00e+00 | FALSE |
| 23 | TIMP1 | 0.4466072 | 1.82e-05 | FALSE |
| 24 | CD93 | 0.4420008 | 3.40e-06 | FALSE |
| 25 | NUPR1 | 0.4369911 | 1.64e-05 | FALSE |
PCA plot of genes characterised by each LFC threshold’s corresponding set of DE genes.
The frame corresponding to LFC = -1 doesn’t correspond to any LFC, all of the genes in the dataset were used to create this plot
LFC thresholds from 0 to 0.1 seem to separate the samples by diagnosis better than the rest
LFC thresholds above 0.2 aren’t able to separate the samples by diagnosis any more
The PC values for each LFC were scaled so they were easier to compare (without this scale, as the LFC increases, the points begin to cluster more and more tightly at the centre of the plot)
ggplotly(pcas_samps %>% mutate(abs_lfc=ifelse(lfc==-1,-1,lfc)) %>%
ggplot(aes(PC1, PC2, color=Diagnosis, shape=Diagnosis)) +
geom_point(aes(frame=abs_lfc, ids=ID), alpha=0.7) + coord_fixed() +
theme_minimal() + ggtitle('PCA plot of samples modifying filtering threshold'))
# # Figure for thesis:
# p1 = pcas_samps %>% filter(lfc == 0) %>% ggplot(aes(PC1, PC2, color=Diagnosis, shape=Diagnosis)) +
# geom_point(alpha=0.7) + theme_minimal() + ggtitle('LFC = 0') + theme(legend.position = 'none') + coord_fixed()
# p2 = pcas_samps %>% filter(lfc == 0.1) %>% ggplot(aes(PC1, PC2, color=Diagnosis, shape=Diagnosis)) +
# geom_point(alpha=0.7) + theme_minimal() + ggtitle('LFC = 0.1') + theme(legend.position = 'none') + coord_fixed()
# p3 = pcas_samps %>% filter(lfc == 0.2) %>% ggplot(aes(PC1, PC2, color=Diagnosis, shape=Diagnosis)) +
# geom_point(alpha=0.7) + theme_minimal() + ggtitle('LFC = 0.2') + theme(legend.position = 'none') + coord_fixed()
# p4 = pcas_samps %>% filter(lfc == 0.3) %>% ggplot(aes(PC1, PC2, color=Diagnosis, shape=Diagnosis)) +
# geom_point(alpha=0.7) + theme_minimal() + ggtitle('LFC = 0.3') + coord_fixed()
# ggarrange(p1,p2,p3,p4, nrow=1, common.legend = TRUE, legend='bottom', widths = c(0.22, 0.25, 0.28, 0.24))
PCA plot of samples characterised by all of the genes vs characterised only by the genes found to be differentially expressed (this plot is just a side-by-side comparison of two frames from the plot above)
pca = datExpr %>% t %>% prcomp
plot_data = pca$x %>% data.frame %>% dplyr::select(PC1:PC2) %>%
mutate('ID'= datExpr %>% colnames %>% substring(2)) %>%
left_join(datMeta, by=c('ID'='Dissected_Sample_ID')) %>%
dplyr::select('ID','PC1','PC2','Diagnosis') %>%
mutate('Diagnosis' = factor(Diagnosis, levels=c('CTL','ASD')))
p1 = plot_data %>% ggplot(aes(PC1, PC2, color=Diagnosis, shape=Diagnosis)) + geom_point(alpha = 0.8) +
xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)')) +
ggtitle('Characterising samples with all genes') +
coord_fixed() + theme_minimal() + theme(legend.position = 'none')
pca = datExpr[genes_info$significant==TRUE,] %>% t %>% prcomp
plot_data = pca$x %>% data.frame %>% dplyr::select(PC1:PC2) %>%
mutate('ID'= datExpr %>% colnames %>% substring(2)) %>%
left_join(datMeta, by=c('ID'='Dissected_Sample_ID')) %>%
dplyr::select('ID','PC1','PC2','Diagnosis') %>%
mutate('Diagnosis' = factor(Diagnosis, levels=c('CTL','ASD')))
p2 = plot_data %>% ggplot(aes(PC1, PC2, color=Diagnosis, shape=Diagnosis)) + geom_point(alpha = 0.8) +
xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)')) +
ggtitle('Characterising samples with DE genes') +
coord_fixed() + theme_minimal()
ggarrange(p1,p2, nrow=1, common.legend = TRUE, legend='bottom')
rm(p1, p2, pca, plot_data)
The second principal component is strongly related to the LFC of the genes and when visualising only the differentially expressed genes, they form two clouds of points, one with the overexpressed genes and another with the underexpressed genes.
# All genes
pca = datExpr %>% prcomp
plot_data = genes_info %>% mutate(PC1 = pca$x[,1], PC2 = pca$x[,2])
pos_zero = -min(plot_data$shrunken_log2FoldChange)/(max(plot_data$shrunken_log2FoldChange)-min(plot_data$shrunken_log2FoldChange))
p1 = plot_data %>% ggplot(aes(PC1, PC2, color=shrunken_log2FoldChange)) + geom_point(alpha=0.5) +
scale_color_gradientn(colours=c('#F8766D','#faa49e','white','#00BFC4','#009499'),
values=c(0, pos_zero-0.15, pos_zero, pos_zero+0.15, 1)) +
xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)')) +
theme_minimal() + ggtitle('PCA of all genes') +
theme(legend.position = 'bottom', legend.key.width=unit(2,'cm')) + labs(color = 'LFC ')
# Only DE genes
pca = datExpr[genes_info$significant,] %>% prcomp
plot_data = genes_info %>% dplyr::filter(significant == TRUE) %>% mutate(PC1 = pca$x[,1], PC2 = pca$x[,2])
pos_zero = -min(plot_data$shrunken_log2FoldChange)/(max(plot_data$shrunken_log2FoldChange)-min(plot_data$shrunken_log2FoldChange))
p2 = plot_data %>% ggplot(aes(PC1, PC2, color=shrunken_log2FoldChange)) + geom_point(alpha=0.5) +
scale_color_gradientn(colours=c('#F8766D','#faa49e','white','#00BFC4','#009499'),
values=c(0, pos_zero-0.15, pos_zero, pos_zero+0.15, 1)) +
xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)')) +
theme_minimal() + ggtitle('PCA of differentially expressed genes') +
theme(legend.position = 'bottom', legend.key.width=unit(2,'cm')) + labs(color = 'LFC ')
ggarrange(p1,p2, nrow=1, common.legend = TRUE, legend='bottom')
rm(pca, plot_data, pos_zero, p1, p2)
The difference in mean level of expression between over and underexpressed genes is statistically significant with a p=value lower than \(10^{-4}\)
plot_data = data.frame('MeanExpr' = rowMeans(datExpr),
'Group' = ifelse(genes_info$shrunken_log2FoldChange>0,'overexpressed','underexpressed')) %>%
mutate(Group = factor(Group, levels = c('underexpressed','overexpressed')))
plot_data[genes_info$significant==TRUE,] %>% ggplot(aes(Group, MeanExpr, fill = Group)) +
geom_boxplot(outlier.colour='gray', outlier.shape='o', outlier.size=3) +
stat_compare_means(label = 'p.signif', method = 't.test', method.args = list(var.equal = FALSE)) +
ylab('Mean Expression') + xlab('') + theme(legend.position = 'none') +
theme_minimal() + theme(legend.position = 'none')
sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.6 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] ggrepel_0.9.1 knitr_1.32
## [3] biomaRt_2.40.5 DESeq2_1.24.0
## [5] SummarizedExperiment_1.14.1 DelayedArray_0.10.0
## [7] BiocParallel_1.18.1 matrixStats_0.60.1
## [9] Biobase_2.44.0 GenomicRanges_1.36.1
## [11] GenomeInfoDb_1.20.0 IRanges_2.18.3
## [13] S4Vectors_0.22.1 BiocGenerics_0.30.0
## [15] ClusterR_1.2.1 gtools_3.9.2
## [17] expss_0.10.7 rstatix_0.6.0
## [19] Rtsne_0.15 ggpubr_0.2.5
## [21] magrittr_2.0.1 GGally_1.5.0
## [23] gridExtra_2.3 viridis_0.6.1
## [25] viridisLite_0.4.0 RColorBrewer_1.1-2
## [27] plotlyutils_0.0.0.9000 plotly_4.9.2
## [29] glue_1.4.2 reshape2_1.4.4
## [31] forcats_0.5.0 stringr_1.4.0
## [33] dplyr_1.0.1 purrr_0.3.4
## [35] readr_1.3.1 tidyr_1.1.0
## [37] tibble_3.1.2 ggplot2_3.3.5
## [39] tidyverse_1.3.0
##
## loaded via a namespace (and not attached):
## [1] readxl_1.3.1 backports_1.2.1 Hmisc_4.4-0
## [4] plyr_1.8.6 lazyeval_0.2.2 splines_3.6.3
## [7] gmp_0.6-2 crosstalk_1.1.1 digest_0.6.27
## [10] htmltools_0.5.2 fansi_0.5.0 checkmate_2.0.0
## [13] memoise_2.0.0 cluster_2.1.0 openxlsx_4.2.4
## [16] annotate_1.62.0 modelr_0.1.6 prettyunits_1.1.1
## [19] jpeg_0.1-9 colorspace_2.0-2 blob_1.2.2
## [22] rvest_0.3.5 haven_2.2.0 xfun_0.25
## [25] crayon_1.4.1 RCurl_1.98-1.4 jsonlite_1.7.2
## [28] genefilter_1.66.0 survival_3.2-7 gtable_0.3.0
## [31] zlibbioc_1.30.0 XVector_0.24.0 car_3.0-7
## [34] abind_1.4-5 scales_1.1.1 DBI_1.1.1
## [37] Rcpp_1.0.7 progress_1.2.2 xtable_1.8-4
## [40] htmlTable_1.13.3 foreign_0.8-76 bit_4.0.4
## [43] Formula_1.2-4 htmlwidgets_1.5.3 httr_1.4.2
## [46] acepack_1.4.1 ellipsis_0.3.2 farver_2.1.0
## [49] pkgconfig_2.0.3 reshape_0.8.8 XML_3.99-0.3
## [52] nnet_7.3-14 sass_0.4.0 dbplyr_1.4.2
## [55] locfit_1.5-9.4 utf8_1.2.2 labeling_0.4.2
## [58] tidyselect_1.1.1 rlang_0.4.11 AnnotationDbi_1.46.1
## [61] cachem_1.0.6 munsell_0.5.0 cellranger_1.1.0
## [64] tools_3.6.3 cli_3.0.1 generics_0.1.0
## [67] RSQLite_2.2.0 broom_0.7.0 evaluate_0.14
## [70] fastmap_1.1.0 yaml_2.2.1 bit64_4.0.5
## [73] fs_1.5.0 zip_2.2.0 xml2_1.3.2
## [76] compiler_3.6.3 rstudioapi_0.13 curl_4.3.2
## [79] png_0.1-7 ggsignif_0.6.2 reprex_0.3.0
## [82] geneplotter_1.62.0 bslib_0.3.0 stringi_1.7.4
## [85] highr_0.9 lattice_0.20-41 Matrix_1.2-18
## [88] vctrs_0.3.8 pillar_1.6.2 lifecycle_1.0.0
## [91] jquerylib_0.1.4 cowplot_1.1.1 data.table_1.14.0
## [94] bitops_1.0-7 R6_2.5.1 latticeExtra_0.6-29
## [97] rio_0.5.16 assertthat_0.2.1 withr_2.4.2
## [100] GenomeInfoDbData_1.2.1 hms_1.1.0 grid_3.6.3
## [103] rpart_4.1-15 rmarkdown_2.7 carData_3.0-4
## [106] lubridate_1.7.10 base64enc_0.1-3